Clean data
dat$gender <- as.factor(dat$gender)
dat$marital_status <- as.factor(dat$marital_status)
dat$category <- as.factor(dat$category)
dat$class <- as.factor(dat$class)
dat$status <- as.factor(dat$survived)
dat$embarked <- as.factor(dat$embarked)
dat$disembarked <- as.factor(dat$disembarked)
dat <- dat %>%
mutate(nationality2 = case_when(nationality == "English" ~ "English",
nationality == "Irish" ~ "Irish",
nationality == "American" ~ "American",
nationality == "Swedish" ~ "Swedish",
nationality == "Finnish" ~ "Finnish",
nationality == "Scottish" ~ "Scottish",
nationality == "French" ~ "French",
nationality == "Italian" ~ "Italian",
nationality == "Canadian" ~ "Canadian",
nationality == "Bulgarian" ~ "Bulgarian",
nationality == "Croatian" ~ "Croatian",
nationality == "Belgian" ~ "Belgian",
nationality == "Norwegian" ~ "Norwegian",
nationality == "Channel Islander" ~ "Channel Islander",
nationality == "Welsh" ~ "Welsh",
nationality == "Swiss" ~ "Swiss",
nationality == "German" ~ "German",
nationality == "Danish" ~ "Danish",
nationality == "Spanish" ~ "Spanish",
nationality == "Australian" ~ "Australian",
nationality == "Polish" ~ "Polish",
nationality == "South African" ~ "South African",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Dutch" ~ "Dutch",
nationality == "Lithuanian" ~ "Lithuanian",
nationality == "Greek" ~ "Greek",
nationality == "Portuguese" ~ "Portuguese",
nationality == "Uruguayan" ~ "Uruguayan",
nationality == "Chinese" ~ "Chinese",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Cape Verdean" ~ "Cape Verdean",
nationality == "Egyptian" ~ "Egyptian",
nationality == "Japanese" ~ "Japanese",
nationality == "Hungarian" ~ "Hungarian",
nationality == "Bosnian" ~ "Bosnian",
nationality == "Hong Kongese" ~ "Hong Kongese",
nationality == "Latvian" ~ "Latvian",
nationality == "Austrian" ~ "Austrian",
nationality == "Greek" ~ "Greek",
nationality == "Mexican" ~ "Mexican",
nationality == "Sweden" ~ "Sweedish",
nationality == "Turkish" ~ "Turkish",
nationality == "Slovenian" ~ "Slovenian",
nationality == "Guyanese" ~ "Guyanese",
nationality == "Haitian" ~ "Haitian",
nationality == "Syrian,Lebanese" ~ "Syrian/Lebanese",
nationality == "Unknown" ~ "Unknown",
TRUE ~ "Other - Multiple", ))
dat <- dat %>%
mutate(nationality2 = ifelse(nationality2 == "Unknown", NA, nationality2))
Descriptives
# Breakdown of passengers by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(gender)) %>%
group_by(class, gender) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
kable(caption = "Breakdown of Passengers by Class and Gender",
col.names = c("Class", "Gender", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passengers by Class and Gender
|
Class
|
Gender
|
Count
|
Percent
|
|
1st Class
|
Female
|
153
|
43.71
|
|
1st Class
|
Male
|
197
|
56.29
|
|
2nd Class
|
Female
|
112
|
38.36
|
|
2nd Class
|
Male
|
180
|
61.64
|
|
3rd Class
|
Female
|
216
|
30.47
|
|
3rd Class
|
Male
|
493
|
69.53
|
# Breakdown of passenger nationalities
dat %>%
filter(!is.na(nationality2)) %>%
group_by(nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities",
col.names = c("Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Breakdown of Passenger Nationalities
|
Nationality
|
Count
|
Percent
|
|
English
|
1037
|
42.36
|
|
Irish
|
361
|
14.75
|
|
American
|
246
|
10.05
|
|
Other - Multiple
|
116
|
4.74
|
|
Swedish
|
99
|
4.04
|
|
Syrian/Lebanese
|
86
|
3.51
|
|
Finnish
|
58
|
2.37
|
|
Scottish
|
49
|
2.00
|
|
French
|
44
|
1.80
|
|
Italian
|
41
|
1.67
|
|
Canadian
|
39
|
1.59
|
|
Bulgarian
|
33
|
1.35
|
|
Croatian
|
28
|
1.14
|
|
Belgian
|
26
|
1.06
|
|
Norwegian
|
26
|
1.06
|
|
Channel Islander
|
25
|
1.02
|
|
Welsh
|
23
|
0.94
|
|
Swiss
|
22
|
0.90
|
|
German
|
14
|
0.57
|
|
Danish
|
11
|
0.45
|
|
Spanish
|
9
|
0.37
|
|
Australian
|
7
|
0.29
|
|
Polish
|
6
|
0.25
|
|
South African
|
5
|
0.20
|
|
Bosnian
|
4
|
0.16
|
|
Hong Kongese
|
4
|
0.16
|
|
Dutch
|
3
|
0.12
|
|
Greek
|
3
|
0.12
|
|
Lithuanian
|
3
|
0.12
|
|
Uruguayan
|
3
|
0.12
|
|
Chinese
|
2
|
0.08
|
|
Portuguese
|
2
|
0.08
|
|
Slovenian
|
2
|
0.08
|
|
Austrian
|
1
|
0.04
|
|
Cape Verdean
|
1
|
0.04
|
|
Egyptian
|
1
|
0.04
|
|
Guyanese
|
1
|
0.04
|
|
Haitian
|
1
|
0.04
|
|
Hungarian
|
1
|
0.04
|
|
Japanese
|
1
|
0.04
|
|
Latvian
|
1
|
0.04
|
|
Mexican
|
1
|
0.04
|
|
Sweedish
|
1
|
0.04
|
|
Turkish
|
1
|
0.04
|
# Breakdown of passenger nationalities by class (all)
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (All)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passenger Nationalities by Class (All)
|
Class
|
Nationality
|
Count
|
Percent
|
|
1st Class
|
American
|
195
|
57.35
|
|
1st Class
|
English
|
51
|
15.00
|
|
1st Class
|
Canadian
|
27
|
7.94
|
|
1st Class
|
Other - Multiple
|
14
|
4.12
|
|
1st Class
|
French
|
10
|
2.94
|
|
1st Class
|
Irish
|
6
|
1.76
|
|
1st Class
|
Swiss
|
6
|
1.76
|
|
1st Class
|
German
|
5
|
1.47
|
|
1st Class
|
Scottish
|
5
|
1.47
|
|
1st Class
|
Spanish
|
4
|
1.18
|
|
1st Class
|
Swedish
|
4
|
1.18
|
|
1st Class
|
Uruguayan
|
3
|
0.88
|
|
1st Class
|
Belgian
|
2
|
0.59
|
|
1st Class
|
Italian
|
2
|
0.59
|
|
1st Class
|
Channel Islander
|
1
|
0.29
|
|
1st Class
|
Dutch
|
1
|
0.29
|
|
1st Class
|
Egyptian
|
1
|
0.29
|
|
1st Class
|
Mexican
|
1
|
0.29
|
|
1st Class
|
Norwegian
|
1
|
0.29
|
|
1st Class
|
Polish
|
1
|
0.29
|
|
2nd Class
|
English
|
145
|
51.06
|
|
2nd Class
|
Other - Multiple
|
25
|
8.80
|
|
2nd Class
|
American
|
24
|
8.45
|
|
2nd Class
|
Channel Islander
|
12
|
4.23
|
|
2nd Class
|
Irish
|
12
|
4.23
|
|
2nd Class
|
French
|
11
|
3.87
|
|
2nd Class
|
Scottish
|
8
|
2.82
|
|
2nd Class
|
Finnish
|
6
|
2.11
|
|
2nd Class
|
Swedish
|
6
|
2.11
|
|
2nd Class
|
Canadian
|
5
|
1.76
|
|
2nd Class
|
South African
|
4
|
1.41
|
|
2nd Class
|
Spanish
|
4
|
1.41
|
|
2nd Class
|
Danish
|
3
|
1.06
|
|
2nd Class
|
Italian
|
3
|
1.06
|
|
2nd Class
|
Lithuanian
|
2
|
0.70
|
|
2nd Class
|
Swiss
|
2
|
0.70
|
|
2nd Class
|
Syrian/Lebanese
|
2
|
0.70
|
|
2nd Class
|
Welsh
|
2
|
0.70
|
|
2nd Class
|
Australian
|
1
|
0.35
|
|
2nd Class
|
Belgian
|
1
|
0.35
|
|
2nd Class
|
German
|
1
|
0.35
|
|
2nd Class
|
Haitian
|
1
|
0.35
|
|
2nd Class
|
Hungarian
|
1
|
0.35
|
|
2nd Class
|
Japanese
|
1
|
0.35
|
|
2nd Class
|
Norwegian
|
1
|
0.35
|
|
2nd Class
|
Portuguese
|
1
|
0.35
|
|
3rd Class
|
English
|
112
|
15.80
|
|
3rd Class
|
Irish
|
105
|
14.81
|
|
3rd Class
|
Swedish
|
89
|
12.55
|
|
3rd Class
|
Syrian/Lebanese
|
83
|
11.71
|
|
3rd Class
|
Other - Multiple
|
69
|
9.73
|
|
3rd Class
|
Finnish
|
52
|
7.33
|
|
3rd Class
|
Bulgarian
|
33
|
4.65
|
|
3rd Class
|
Croatian
|
28
|
3.95
|
|
3rd Class
|
Norwegian
|
24
|
3.39
|
|
3rd Class
|
American
|
23
|
3.24
|
|
3rd Class
|
Belgian
|
22
|
3.10
|
|
3rd Class
|
Danish
|
7
|
0.99
|
|
3rd Class
|
Scottish
|
6
|
0.85
|
|
3rd Class
|
Welsh
|
6
|
0.85
|
|
3rd Class
|
Canadian
|
5
|
0.71
|
|
3rd Class
|
French
|
5
|
0.71
|
|
3rd Class
|
Polish
|
5
|
0.71
|
|
3rd Class
|
Swiss
|
5
|
0.71
|
|
3rd Class
|
Bosnian
|
4
|
0.56
|
|
3rd Class
|
Hong Kongese
|
4
|
0.56
|
|
3rd Class
|
Italian
|
4
|
0.56
|
|
3rd Class
|
Greek
|
3
|
0.42
|
|
3rd Class
|
Channel Islander
|
2
|
0.28
|
|
3rd Class
|
Chinese
|
2
|
0.28
|
|
3rd Class
|
German
|
2
|
0.28
|
|
3rd Class
|
Slovenian
|
2
|
0.28
|
|
3rd Class
|
Australian
|
1
|
0.14
|
|
3rd Class
|
Austrian
|
1
|
0.14
|
|
3rd Class
|
Latvian
|
1
|
0.14
|
|
3rd Class
|
Lithuanian
|
1
|
0.14
|
|
3rd Class
|
Portuguese
|
1
|
0.14
|
|
3rd Class
|
Sweedish
|
1
|
0.14
|
|
3rd Class
|
Turkish
|
1
|
0.14
|
# Breakdown of passenger nationalities by class (>= 5%)
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(nationality2)) %>%
group_by(class, nationality2) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
filter(percent >= 5) %>%
arrange(class, desc(percent)) %>%
kable(caption = "Breakdown of Passenger Nationalities by Class (>= 5%)",
col.names = c("Class", "Nationality", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Breakdown of Passenger Nationalities by Class (>= 5%)
|
Class
|
Nationality
|
Count
|
Percent
|
|
1st Class
|
American
|
195
|
57.35
|
|
1st Class
|
English
|
51
|
15.00
|
|
1st Class
|
Canadian
|
27
|
7.94
|
|
2nd Class
|
English
|
145
|
51.06
|
|
2nd Class
|
Other - Multiple
|
25
|
8.80
|
|
2nd Class
|
American
|
24
|
8.45
|
|
3rd Class
|
English
|
112
|
15.80
|
|
3rd Class
|
Irish
|
105
|
14.81
|
|
3rd Class
|
Swedish
|
89
|
12.55
|
|
3rd Class
|
Syrian/Lebanese
|
83
|
11.71
|
|
3rd Class
|
Other - Multiple
|
69
|
9.73
|
|
3rd Class
|
Finnish
|
52
|
7.33
|
# Average age by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
group_by(class) %>%
summarize(avg_age = mean(age), min_age = min(age), max_age = max(age)) %>%
kable(caption = "Average Age by Class",
col.names = c("Class", "Average Age", "Minimum Age", "Maximum Age"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
Average Age by Class
|
Class
|
Average Age
|
Minimum Age
|
Maximum Age
|
|
1st Class
|
39.12
|
0
|
71
|
|
2nd Class
|
30.01
|
0
|
71
|
|
3rd Class
|
25.12
|
0
|
74
|
# Survival rate by class
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, survived) %>%
kable(caption = "Survival Rate by Class",
col.names = c("Class", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'class'. You can override using the `.groups` argument.
Survival Rate by Class
|
Class
|
Survived
|
Count
|
Percent
|
|
1st Class
|
Lost
|
123
|
37.96
|
|
1st Class
|
Saved
|
201
|
62.04
|
|
2nd Class
|
Lost
|
166
|
58.45
|
|
2nd Class
|
Saved
|
118
|
41.55
|
|
3rd Class
|
Lost
|
528
|
74.47
|
|
3rd Class
|
Saved
|
181
|
25.53
|
# Survival rate by gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(gender, survived) %>%
kable(caption = "Survival Rate by Gender",
col.names = c("Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'gender'. You can override using the `.groups` argument.
Survival Rate by Gender
|
Gender
|
Survived
|
Count
|
Percent
|
|
Female
|
Lost
|
127
|
27.25
|
|
Female
|
Saved
|
339
|
72.75
|
|
Male
|
Lost
|
690
|
81.08
|
|
Male
|
Saved
|
161
|
18.92
|
# Survival rate by class and gender
dat %>%
filter(category == "Passenger") %>%
filter(!is.na(survived)) %>%
group_by(class, gender, survived) %>%
summarize(count = n()) %>%
mutate(percent = (count/sum(count))*100) %>%
arrange(class, gender) %>%
kable(caption = "Survival Rate by Class and Gender",
col.names = c("Class", "Gender", "Survived", "Count", "Percent"),
digits = 2,
booktabs = TRUE) %>%
kable_styling()
## `summarise()` has grouped output by 'class', 'gender'. You can override using the `.groups` argument.
Survival Rate by Class and Gender
|
Class
|
Gender
|
Survived
|
Count
|
Percent
|
|
1st Class
|
Female
|
Lost
|
5
|
3.47
|
|
1st Class
|
Female
|
Saved
|
139
|
96.53
|
|
1st Class
|
Male
|
Lost
|
118
|
65.56
|
|
1st Class
|
Male
|
Saved
|
62
|
34.44
|
|
2nd Class
|
Female
|
Lost
|
12
|
11.32
|
|
2nd Class
|
Female
|
Saved
|
94
|
88.68
|
|
2nd Class
|
Male
|
Lost
|
154
|
86.52
|
|
2nd Class
|
Male
|
Saved
|
24
|
13.48
|
|
3rd Class
|
Female
|
Lost
|
110
|
50.93
|
|
3rd Class
|
Female
|
Saved
|
106
|
49.07
|
|
3rd Class
|
Male
|
Lost
|
418
|
84.79
|
|
3rd Class
|
Male
|
Saved
|
75
|
15.21
|
Density ridges
surv_classhist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, class)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class",
x = "Age Distribution", y = "Passenger Class") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_classhist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.69

surv_agehist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_agehist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 3.88

surv_ageclass_hist <- dat %>%
filter(category == "Passenger") %>%
filter(!is.na(age)) %>%
filter(!is.na(survived)) %>%
ggplot(aes(age, gender)) +
facet_wrap(~class, nrow=3) +
geom_density_ridges(aes(fill = factor(survived))) +
labs(title = "Age Distribution of Survival Status By Class and Gender",
x = "Age Distribution", y = "Passenger Gender") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
surv_ageclass_hist + scale_fill_manual(name = "Survival", values = c("black","dark red"))
## Picking joint bandwidth of 6.54
## Picking joint bandwidth of 5.57
## Picking joint bandwidth of 2.96
